In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
data_dict = pickle.load(open("../ud120-projects/final_project/final_project_dataset.pkl", "r") )
Since the dataset for this project is so small, a hold-out set will not be used; instead, k-fold training/testing splits will be used to measure accuracy.
This is because even with a stratified hold-out set of 20%, with only 146 data points, a large amount of missing data, and only 18 POIs, there would be roughly 3 positive points left for a final test. Performance metrics computed on such a small hold-out set would have very little precision, while setting the data aside would also hurt the ability to build the model.
"when the number of samples is not large, a strong case can be made that a test set should be avoided because every sample may be needed for model building. (...) Additionally, the size of the test set may not have sufficient power or precision to make reasonable judgements. "
[1] Kuhn M., Kjell J.(2013). Applied Predictive Modeling. Springer. pp.67
Hawkins et al. (2003) concisely summarize this point:“holdout samples of tolerable size [. . . ] do not match the cross-validation itself for reliability in assessing model fit and are hard to motivate.”
[2] Hawkins D, Basak S, Mills D (2003). “Assessing Model Fit by Cross– Validation.” Journal of Chemical Information and Computer Sciences, 43(2), 579–586
Model performance will instead be estimated with k-fold and stratified shuffle-split resampling, as sketched below.
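As a rough sketch of the idea (toy data with approximately the POI class balance; the real feature matrix is built further down in this notebook):
In [ ]:
# Sketch only: toy data standing in for the real features (X_df, y_df),
# which are constructed later in this notebook.
import numpy as np
from sklearn.cross_validation import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB

X_toy = np.random.randn(145, 5)
y_toy = np.array([0] * 127 + [1] * 18)                     # ~12% POIs, like the real data
folds = StratifiedKFold(y_toy, n_folds=4, shuffle=True)    # each fold keeps the POI ratio
print cross_val_score(GaussianNB(), X_toy, y_toy, scoring='f1', cv=folds)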
In [4]:
my_dataset = data_dict
In [5]:
df = pd.DataFrame.from_dict(data_dict, orient='index')
'NaN' was imported as a string instead of as a missing value. We will convert these to the NaN type and look at how many missing values the data has.
In [6]:
# Replace 'NaN' strings with actual np.nan values
df = df.replace('NaN', np.nan)
# Replace email strings with True/False boolean as to whether an email was present or not
df['email_address'] = df['email_address'].fillna(0).apply(lambda x: x != 0)
In [6]:
df.info()
In [7]:
df[df['salary'] > 1000000]
df[df.index == 'TOTAL']
df = df.drop('TOTAL', axis=0)
In [8]:
# df.pivot(index=df.index, columns='poi')
df.columns
Out[8]:
In [118]:
cols = [x for x in df.columns if x not in ['restricted_stock_deferred', 'loan_advances', 'director_fees']]
In [119]:
for each in cols:
    g = sns.FacetGrid(df, col='poi', margin_titles=True, size=6)
    g.map(plt.hist, each, color='steelblue')
In [114]:
print cols
In [10]:
df2 = df[['poi', 'exercised_stock_options']]
df2['log_eso'] = df['exercised_stock_options'].apply(np.log)
In [11]:
g = sns.FacetGrid(df2, col='poi', margin_titles=True, size=6)
g.map(plt.hist, 'log_eso', color='steelblue', bins=25)
Out[11]:
In [12]:
df.columns
Out[12]:
In [13]:
df2 = df
df2['ratio_messages'] = df['from_messages']/df['to_messages']
In [14]:
g = sns.FacetGrid(df2, col='poi', margin_titles=True, size=6)
g.map(plt.hist, 'ratio_messages', color='steelblue', bins=20)
Out[14]:
In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.base import TransformerMixin
from pandas import DataFrame
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import f_classif
from sklearn.naive_bayes import GaussianNB
In [16]:
my_imputer = Imputer(missing_values = 'NaN', strategy='median', axis=0)
In [17]:
my_scaler = StandardScaler(with_mean=True, with_std=True)
By default, GridSearchCV uses 3-fold cross-validation. However, if it detects that a classifier (rather than a regressor) is passed, it uses a stratified 3-fold split.
http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html
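A quick illustration of that behaviour on toy data (arbitrary classifier and grid, not part of the analysis):
In [ ]:
# With a classifier and an integer cv, GridSearchCV stratifies the 3 folds.
# Toy data and grid for illustration only; the project pipelines are defined below.
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

X_toy = np.random.randn(60, 4)
y_toy = np.array([0] * 48 + [1] * 12)
toy_search = GridSearchCV(LogisticRegression(), param_grid={'C': [0.1, 1, 10]}, cv=3)
toy_search.fit(X_toy, y_toy)
print toy_search.best_params_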
In [18]:
from sklearn.base import TransformerMixin
class ColumnExtractor(TransformerMixin):
    '''
    Column extractor transformer for sklearn pipelines.
    Inherits fit_transform() from TransformerMixin, but it is explicitly
    defined here for clarity.
    Methods to extract pandas dataframe columns are defined for this class.
    '''
    def __init__(self, columns=[]):
        self.columns = columns

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def transform(self, X, **transform_params):
        '''
        Input: A pandas dataframe.
        Output: A pandas dataframe containing only the columns named at construction.
        '''
        return X[self.columns]

    def fit(self, X, y=None, **fit_params):
        return self
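As a hypothetical example (column subset chosen only for illustration), the extractor would sit at the head of a pipeline so that only the named DataFrame columns reach the later steps:
In [ ]:
# Hypothetical usage of ColumnExtractor; 'financial_cols' is an example subset.
financial_cols = ['salary', 'bonus', 'exercised_stock_options']
extract_pipe = Pipeline([
    ('extract', ColumnExtractor(columns=financial_cols)),
    ('imputer', Imputer(missing_values='NaN', strategy='median', axis=0)),
    ('scaler', StandardScaler()),
])
# extract_pipe.fit_transform(df) would return only the imputed, scaled financial columns.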
In [19]:
class DenseTransformer(TransformerMixin):
    '''
    todense() transformer for sklearn pipelines.
    Inherits fit_transform() from TransformerMixin, but it is explicitly
    defined here for clarity.
    Converts a scipy sparse matrix from a previous pipeline step into a dense matrix.
    '''
    def transform(self, X, y=None, **fit_params):
        '''
        Input: A scipy sparse matrix.
        Output: The same data as a dense matrix.
        '''
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self
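A sketch of where this helps (hypothetical pipeline, not used later in this notebook): steps such as OneHotEncoder emit scipy sparse matrices, which estimators like GaussianNB cannot consume directly, so the transformer converts in between.
In [ ]:
# Hypothetical placement of DenseTransformer between a sparse-output step and a
# dense-only estimator.
from sklearn.preprocessing import OneHotEncoder
dense_pipe = Pipeline([
    ('onehot', OneHotEncoder()),       # outputs a scipy sparse matrix
    ('to_dense', DenseTransformer()),  # calls .todense() so GaussianNB can fit
    ('nb', GaussianNB()),
])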
In [20]:
from pandas import DataFrame
class ModelTransformer(TransformerMixin):
    '''
    Transformer for sklearn pipelines which applies a prediction model to the input.
    Inherits fit_transform() from TransformerMixin.
    Methods to apply a model transformation to the input are defined for this class,
    e.g. apply KMeans clustering to X and output the cluster assignments to be used
    as features for a subsequent model.
    '''
    def __init__(self, model):
        '''
        Initialize with the model to be used by the fit/transform methods.
        '''
        self.model = model

    def fit(self, *args, **kwargs):
        '''
        Fit the wrapped model with whatever *args it requires.
        '''
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        '''
        Input: pandas DataFrame.
        Output: pandas DataFrame of predictions from the model.
        '''
        return DataFrame(self.model.predict(X))
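A sketch of the KMeans use case mentioned in the docstring (hypothetical, not part of the analysis below): the cluster assignment becomes an extra feature alongside the scaled originals.
In [ ]:
# Hypothetical feature union: scaled features plus a KMeans cluster label per row.
from sklearn.cluster import KMeans
from sklearn.pipeline import FeatureUnion
cluster_features = FeatureUnion([
    ('scaled', StandardScaler()),
    ('cluster_id', ModelTransformer(KMeans(n_clusters=3))),
])
# cluster_features.fit_transform(X) would append each row's predicted cluster label
# to its scaled feature values.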
In [21]:
class HoursOfDayTransformer(TransformerMixin):
    '''
    Transformer for sklearn pipelines which extracts the hours from a 'datetime' column
    of a pandas dataframe.
    Inherits fit_transform() from TransformerMixin.
    '''
    def transform(self, X, **transform_params):
        '''
        Input: pandas DataFrame with a ['datetime'] column to extract hours from.
        Output: pandas DataFrame of the extracted hours.
        '''
        hours = DataFrame(X['datetime'].apply(lambda x: x.hour))
        return hours

    def fit(self, X, y=None, **fit_params):
        '''
        Nothing to fit; return self.
        '''
        return self
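This transformer does not apply to the Enron data (there is no datetime column); a toy example of what it does:
In [ ]:
# Toy illustration only -- the project dataframe has no 'datetime' column.
toy = DataFrame({'datetime': pd.to_datetime(['2001-05-01 09:30', '2001-05-01 17:45'])})
print HoursOfDayTransformer().fit_transform(toy)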
In [22]:
from sklearn.preprocessing import Binarizer
In [23]:
df.columns
Out[23]:
In [24]:
binarizer = Binarizer(threshold=1000000)
In [25]:
binarizer.fit_transform(df['exercised_stock_options'].fillna(0))
Out[25]:
In [45]:
Out[45]:
In [71]:
X_df = df2.drop('poi', axis=1)
y_df = df2['poi']
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import LeaveOneOut
from sklearn.cross_validation import StratifiedShuffleSplit
In [23]:
sk_fold = StratifiedShuffleSplit(y_df, n_iter=10, test_size=0.1)
for train_index, test_index in sk_fold:
    X_train, X_test = X_df.irow(train_index), X_df.irow(test_index)
    y_train, y_test = y_df[train_index], y_df[test_index]
    pipeline = Pipeline([
        ('imputer', Imputer(missing_values='NaN', axis=0)),
        ('standardizer', StandardScaler(with_mean=True, with_std=True)),
        ('low_var_remover', VarianceThreshold(threshold=.10)),
        # ('pca', PCA()),
        ('ET', ExtraTreesClassifier(oob_score=True, bootstrap=True))
    ])
    params = {'ET__n_estimators': [1500],
              'ET__max_features': ['auto', None, 3, 5, 10, 15],
              'ET__min_samples_split': [2, 4, 10],
              'ET__min_samples_leaf': [1, 2, 5],
              'ET__criterion': ['gini', 'entropy'],
              'imputer__strategy': ['median', 'mean']}
    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=-1, cv=3, scoring='f1')
    grid_search.fit(X_train, y=y_train)
    test_pred = grid_search.predict(X_test)
    #print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
    print "Best Estimator: ", grid_search.best_estimator_
    print "F1: ", f1_score(y_test, test_pred)
    print "Confusion Matrix: "
    print confusion_matrix(y_test, test_pred)
    print "Accuracy Score: ", accuracy_score(y_test, test_pred)
    print "Best Params: ", grid_search.best_params_
    print ""
In [ ]:
F1: 0.666666666667
Confusion Matrix:
[[13 0]
[ 1 1]]
Accuracy Score: 0.933333333333
Best Params: {'ET__n_estimators': 1500, 'ET__criterion': 'gini', 'ET__max_features': None, 'imputer__strategy': 'median', 'ET__min_samples_split': 10, 'ET__min_samples_leaf': 1}
Best Estimator: Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('low_var_remover', VarianceThreshold(threshold=0.1)), ('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,
...les_split=4, n_estimators=1500, n_jobs=1,
oob_score=True, random_state=None, verbose=0))])
In [ ]:
F1: 0.666666666667
Confusion Matrix:
[[13 0]
[ 1 1]]
Accuracy Score: 0.933333333333
Best Params: {'ET__n_estimators': 1500, 'ET__criterion': 'gini', 'ET__max_features': 'auto', 'imputer__strategy': 'mean', 'ET__min_samples_split': 2, 'ET__min_samples_leaf': 1}
Best Estimator: Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)), ('low_var_remover', VarianceThreshold(threshold=0.1)), ('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,...les_split=2, n_estimators=1500, n_jobs=1,
oob_score=True, random_state=None, verbose=0))])
In [163]:
n_iter = 100
sk_fold = StratifiedShuffleSplit(y_df, n_iter=n_iter, test_size=0.1)
f1_avg = []
for train_index, test_index in sk_fold:
    X_train, X_test = X_df.irow(train_index), X_df.irow(test_index)
    y_train, y_test = y_df[train_index], y_df[test_index]
    grid_search = Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)),
                                  ('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
                                  ('low_var_remover', VarianceThreshold(threshold=0.1)),
                                  ('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,
                                                              n_estimators=1500, n_jobs=1, criterion='gini',
                                                              oob_score=True, random_state=None, verbose=0,
                                                              max_features=None, min_samples_split=10,
                                                              min_samples_leaf=1))])
    #params = {'ET__n_estimators': [1500],
    #          'ET__criterion': ['gini'],
    #          'ET__max_features': [None],
    #          'imputer__strategy': ['median'],
    #          'ET__min_samples_split': [10],
    #          'ET__min_samples_leaf': [1]}
    #grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=-1, cv=3, scoring='f1')
    grid_search.fit(X_train, y=y_train)
    test_pred = grid_search.predict(X_test)
    #print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
    #print "Best Estimator: ", grid_search.best_estimator_
    f1_avg.append(f1_score(y_test, test_pred))
    #print "F1: ", f1_score(y_test, test_pred)
    #print "Confusion Matrix: "
    #print confusion_matrix(y_test, test_pred)
    #print "Accuracy Score: ", accuracy_score(y_test, test_pred)
    #print "Best Params: ", grid_search.best_params_
    #print ""
print sum(f1_avg)/n_iter
In [204]:
from sklearn.svm import LinearSVC
sk_fold = StratifiedShuffleSplit(y_df, n_iter=10, test_size=0.2)
pipeline = Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)),
('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
('low_var_remover', VarianceThreshold(threshold=0.1)),
('feature_selection', LinearSVC()),
('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,
criterion='gini', n_estimators=1500, n_jobs=1,
oob_score=True, random_state=None, verbose=0,
max_features='auto', min_samples_split=2,
min_samples_leaf=1))])
params = {'ET__n_estimators': [1500],
'ET__max_features': ['auto', None],
'ET__min_samples_split': [2, 4, 10],
'ET__min_samples_leaf': [1, 2, 5],
'feature_selection__C': [1, 10, 100],
#'ET__criterion' : ['gini', 'entropy'],
#'imputer__strategy': ['median', 'mean'],
'low_var_remover__threshold': [0, 0.1, .25, .50]}
grid_search = GridSearchCV(pipeline, param_grid=params, cv=sk_fold, n_jobs = -1, scoring='f1')
grid_search.fit(X_df, y=y_df)
#test_pred = grid_search.predict(X_test)
#print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
print "Best Estimator: ", grid_search.best_estimator_
#f1_avg.append(f1_score(y_test, test_pred))
#print "F1: ", f1_score(y_test, test_pred)
#print "Confusion Matrix: "
#print confusion_matrix(y_test, test_pred)
#print "Accuracy Score: ", accuracy_score(y_test, test_pred)
print "Best Params: ", grid_search.best_params_
#print ""
#print sum(f1_avg)/n_iter
In [212]:
n_iter = 100
sk_fold = StratifiedShuffleSplit(y_df, n_iter=n_iter, test_size=0.1)
f1_avg = []
for train_index, test_index in sk_fold:
    X_train, X_test = X_df.irow(train_index), X_df.irow(test_index)
    y_train, y_test = y_df[train_index], y_df[test_index]
    grid_search.best_estimator_.fit(X_train, y=y_train)
    test_pred = grid_search.predict(X_test)
    #print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
    #print "Best Estimator: ", grid_search.best_estimator_
    #print f1_score(y_test, test_pred)
    f1_avg.append(f1_score(y_test, test_pred))
print sum(f1_avg)/n_iter
In [276]:
from sklearn.neighbors import KNeighborsRegressor
In [277]:
income_imputer = KNeighborsRegressor(n_neighbors=1)
In [353]:
df_salary = df_50[df_50.salary.isnull()==False]
df_null_salary = df_50[df_50.salary.isnull()==True]
In [354]:
import seaborn as sns
df_salary.corr()
sns.set(style='darkgrid')
f, ax = plt.subplots(figsize=(14, 14))
cmap = sns.diverging_palette(10, 220, as_cmap=True)
sns.corrplot(df_salary.corr(), annot=True, sig_stars=False,
diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [355]:
df_salary.corr().ix[: ,0] # Pick the first column which we are predicting.
Out[355]:
In [440]:
def kcluster_null(df=None, cols=None):
    # Assert values are passed in. Very lax check here for now.
    assert df is not None, "Please provide a pandas dataframe"
    assert cols is not None, "Please provide a list of columns to impute"
    # Create a KNN regression estimator for imputing each column.
    income_imputer = KNeighborsRegressor(n_neighbors=1)
    # Loop through the columns passed in to impute each one sequentially.
    # Ideally these should be somewhat correlated since they will be used in KNN to
    # predict each other, one column at a time.
    for each in cols:
        # Create a temp list that does not include the column being predicted.
        temp_cols = [col for col in cols if col != each]
        # Create a dataframe that contains no missing values in the column being predicted.
        # This will be used to train the KNN estimator.
        df_col = df[df[each].isnull() == False]
        # Create a dataframe with all of the nulls in the column being predicted.
        df_null_col = df[df[each].isnull() == True]
        # Create a temp dataframe filling in the medians for each predictor column
        # that is missing values.
        # This step is needed since we have so many missing values distributed through
        # all of the columns.
        temp_df_medians = df_col[temp_cols].apply(lambda x: x.fillna(x.median()), axis=0)
        # Fit our KNN imputer to this dataframe now that we have values for every column.
        income_imputer.fit(temp_df_medians, df_col[each])
        # Fill the df (that has null values being predicted) with medians in the other
        # columns not being predicted.
        # ** This currently uses its own medians and should ideally use the predictor df's
        # ** median values to fill in NA's of columns being used to predict.
        temp_null_medians = df_null_col[temp_cols].apply(lambda x: x.fillna(x.median()), axis=0)
        # Predict the null values for the current 'each' variable.
        new_values = income_imputer.predict(temp_null_medians[temp_cols])
        # Replace the null values of the original null dataframe with the predicted values.
        df_null_col[each] = new_values
        # Append the newly predicted-nulls dataframe to the dataframe which contained
        # no null values.
        # Overwrite the original df with this one containing predicted columns.
        # Index order will not be preserved since it is rearranged each time by
        # null values.
        df = df_col.append(df_null_col)
    return df.sort_index(axis=0)
In [447]:
cols = ['salary', 'other', 'total_stock_value', 'exercised_stock_options',
'total_payments', 'restricted_stock']
imputed_df = kcluster_null(df_50, cols = cols)
imputed_df.salary.plot()
Out[447]:
In [448]:
df_50_salary_done.sort_index(axis=0).salary.plot()
Out[448]:
In [450]:
# Check that the KNN-imputed salaries match the manually imputed ones.
(df_50_salary_done.sort_index(axis=0).salary == imputed_df.salary).all()
Out[450]:
In [418]:
# Find some the highest correlated columns to predict salary.
cols = ['other', 'total_stock_value', 'exercised_stock_options',
'total_payments', 'restricted_stock']
# Temporarily fill in any missing values in our predictors with the median.
# Medians will be more robust for our skewed data.
## median_imputer = Imputer(missing_values = 'NaN', axis=0)
## median_imputer.fit(df_salary[cols])
## temp_df_medians = median_imputer.transform(df_salary[cols])
temp_df_medians = df_salary[cols].apply(lambda x: x.fillna(x.median()), axis=0)
df_col_medians = list(df_salary[cols].apply(lambda x: x.median(), axis=0))
income_imputer.fit(temp_df_medians, df_salary.salary)
Out[418]:
In [419]:
## temp_null_medians[cols] = median_imputer.transform(df_null_salary[cols])
temp_null_medians = df_null_salary[cols].apply(lambda x: x.fillna(x.median()), axis=0)
new_values = income_imputer.predict(temp_null_medians[cols])
In [420]:
df_null_salary['salary'] = new_values
In [421]:
df_50_salary_done = df_salary.append(df_null_salary)
In [422]:
df_50_salary_done.salary.plot(figsize=(14,12), label='kcluster')
#df.salary.fillna(df.salary.median()).plot()
#df.salary.plot(label='missing')
Out[422]:
In [390]:
df_50_salary_done.info()
In [392]:
df_50_salary_done.salary.nunique()
Out[392]:
In [291]:
low_var_remover = VarianceThreshold(threshold=.5)
In [336]:
# Remove columns with more than 50% NA's
df_50 = df.dropna(axis=1, thresh=len(df)/2)
# Since email_address and poi are True/False, every record should have at least 2 non-NA.
# We'll next remove any rows that don't have at least 3 non-NA values.
# This will only remove records that are completely blank except for poi/email_address.
df_50 = df_50.dropna(axis=0, thresh=3)
# One record was removed
df_50.info()
In [337]:
pd.value_counts(df_50.poi)
Out[337]:
In [338]:
df_50.corr()
sns.set(style='darkgrid')
f, ax = plt.subplots(figsize=(14, 14))
cmap = sns.diverging_palette(10, 220, as_cmap=True)
sns.corrplot(df_50.corr(), annot=True, sig_stars=False,
diag_names=False, cmap=cmap, ax=ax)
f.tight_layout()
In [300]:
help(df.dropna)
In [186]:
## Add features
df2 = df.copy()
df2 = df2.apply(lambda x: x.fillna(x.median()), axis=0)
df2['EXERCISE_SALARY'] = df2['exercised_stock_options']/df2['salary']
df2['LOG_EX_STOCK_OPT'] = df2['exercised_stock_options'].apply(np.log)
df2['DEFERRED_STOCK'] = df2['deferred_income']/df2['total_stock_value']
df2['EXERCISE_TOTAL_STOCK'] = df2['exercised_stock_options']/df2['total_stock_value']
df2['EXERCISE_DEFERRED'] = df2['exercised_stock_options']/df2['deferred_income']
#df2['TWO_FROM'] = df2['from_poi_to_this_person']/df2['from_this_person_to_poi']
X_df = df2.drop('poi', axis=1)
y_df = df2['poi']
In [222]:
#print X_noNA
rf = ExtraTreesClassifier(n_estimators=2000, min_samples_split=4,
max_features=None, min_samples_leaf=1)
rf.fit(X_df, y_df)
rf.feature_importances_
Out[222]:
In [223]:
importances = rf.feature_importances_
sorted_idx = np.argsort(importances)
In [224]:
padding = np.arange(len(X_df.columns)) + 0.5
plt.barh(padding, importances[sorted_idx], align='center')
plt.yticks(padding, X_df.columns[sorted_idx])
plt.xlabel("Relative Importance")
plt.title("Variable Importance")
plt.show()
# http://nbviewer.ipython.org/github/yhat/DataGotham2013/blob/master/notebooks/7%20-%20Feature%20Engineering.ipynb
In [225]:
pd.crosstab(df.poi, df.email_address)
Out[225]:
In [ ]:
In [ ]:
from sklearn.svm import LinearSVC
sk_fold = StratifiedShuffleSplit(y_df, n_iter=10, test_size=0.2)
pipeline = Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)),
('standardizer', StandardScaler(copy=True, with_mean=True, with_std=True)),
('low_var_remover', VarianceThreshold(threshold=0.1)),
('feature_selection', LinearSVC()),
('ET', ExtraTreesClassifier(bootstrap=True, compute_importances=None,
criterion='gini', n_estimators=1500, n_jobs=1,
oob_score=True, random_state=None, verbose=0,
max_features='auto', min_samples_split=2,
min_samples_leaf=1))])
params = {'ET__n_estimators': [1500],
'ET__max_features': ['auto', None],
'ET__min_samples_split': [2, 4, 10],
'ET__min_samples_leaf': [1, 2, 5],
'feature_selection__C': [1, 10, 100],
#'ET__criterion' : ['gini', 'entropy'],
#'imputer__strategy': ['median', 'mean'],
'low_var_remover__threshold': [0, 0.1, .25, .50]}
grid_search = GridSearchCV(pipeline, param_grid=params, cv=sk_fold, n_jobs = -1, scoring='f1')
grid_search.fit(X_df, y=y_df)
#test_pred = grid_search.predict(X_test)
#print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
print "Best Estimator: ", grid_search.best_estimator_
#f1_avg.append(f1_score(y_test, test_pred))
#print "F1: ", f1_score(y_test, test_pred)
#print "Confusion Matrix: "
#print confusion_matrix(y_test, test_pred)
#print "Accuracy Score: ", accuracy_score(y_test, test_pred)
print "Best Params: ", grid_search.best_params_
In [ ]:
n_iter = 100
sk_fold = StratifiedShuffleSplit(y_df, n_iter=n_iter, test_size=0.1)
f1_avg = []
for train_index, test_index in sk_fold:
    X_train, X_test = X_df.irow(train_index), X_df.irow(test_index)
    y_train, y_test = y_df[train_index], y_df[test_index]
    grid_search.best_estimator_.fit(X_train, y=y_train)
    test_pred = grid_search.predict(X_test)
    #print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
    #print "Best Estimator: ", grid_search.best_estimator_
    #print f1_score(y_test, test_pred)
    f1_avg.append(f1_score(y_test, test_pred))
print sum(f1_avg)/n_iter
In [ ]:
In [20]:
pipeline = Pipeline([
('imputer', Imputer(missing_values = 'NaN', axis=0)),
('standardizer', StandardScaler(with_mean=True, with_std=True)),
('low_var_remover', VarianceThreshold(threshold=.10)),
# ('pca', PCA()),
('ET', ExtraTreesClassifier(oob_score=True, bootstrap=True) )
])
params = {'ET__n_estimators': [500],
'ET__max_features': ['auto', None, 3, 5, 10, 15],
'ET__min_samples_split': [2, 4, 10],
'ET__min_samples_leaf': [1, 2, 5],
'ET__criterion' : ['gini', 'entropy'],
'imputer__strategy': ['median', 'mean']}
grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs = -1, cv=3, scoring='f1', verbose=1)
grid_search.fit(X_train, y=y_train)
test_pred = grid_search.predict(X_test)
#print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
print "Best Estimator: ", grid_search.best_estimator_
print "F1: ", f1_score(y_test, test_pred)
print "Confusion Matrix: "
print confusion_matrix(y_test, test_pred)
print "Accuracy Score: ", accuracy_score(y_test, test_pred)
print "Best Params: ", grid_search.best_params_
print ""
In [ ]:
In [ ]:
# Logistic Regression Baseline
In [55]:
from sklearn.linear_model import LogisticRegression
sk_fold = StratifiedKFold(y_df, n_folds=4, shuffle=True)
for train_index, test_index in sk_fold:
    X_train, X_test = X_df.irow(train_index), X_df.irow(test_index)
    y_train, y_test = y_df[train_index], y_df[test_index]
    pipeline = Pipeline([
        ('imputer', Imputer(missing_values='NaN', strategy='median', axis=0)),
        ('standardizer', StandardScaler(with_mean=True, with_std=True)),
        ('low_var_remover', VarianceThreshold(threshold=.10)),
        ('LR', LogisticRegression())
    ])
    params = {'LR__C': [.01, 1, 10]}
    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=-1, cv=None, scoring='f1')
    grid_search.fit(X_train, y=y_train)
    test_pred = grid_search.predict(X_test)
    #print "Cross_Val_score: ", cross_val_score(grid_search, X_train, y_train)
    print "Best Estimator: ", grid_search.best_estimator_
    print "F1: ", f1_score(y_test, test_pred)
    print "Confusion Matrix: "
    print confusion_matrix(y_test, test_pred)
    print "Accuracy Score: ", accuracy_score(y_test, test_pred)
    print "Best Params: ", grid_search.best_params_
    print ""
In [54]:
## Baseline scores for a naive classifier that always predicts the majority class (0).
Out[54]:
In [40]:
sk_fold = StratifiedKFold(y_df, n_folds=4, shuffle=True)
for train_index, test_index in sk_fold:
    X_train, X_test = X_df.irow(train_index), X_df.irow(test_index)
    y_train, y_test = y_df[train_index], y_df[test_index]
    test_pred = np.zeros(y_test.shape)
    print "F1: ", f1_score(y_test, test_pred)
    print "Confusion Matrix: "
    print confusion_matrix(y_test, test_pred)
    print "Accuracy Score: ", accuracy_score(y_test, test_pred)
    print ""
In [36]:
Out[36]:
In [ ]: